{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import keras\n",
    "import math\n",
    "import random\n",
    "import codecs\n",
    "import gc\n",
    "\n",
    "\n",
    "from keras.models import Sequential,Model\n",
    "from keras.layers import Dense, Embedding, LSTM,Flatten,SpatialDropout1D,Dropout,Input\n",
    "from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D\n",
    "\n",
    "\n",
    "# --- BERT resources ---\n",
    "# Directory with the BERT scripts. It is listed (the result is discarded, so this only fails\n",
    "# loudly when the directory is missing) and then placed first on sys.path -- presumably so the\n",
    "# `run_classifier` / `tokenization` imports further down resolve from it.\n",
    "BERT_SCRIPTS_DIR = '../input/pretrained-bert-including-scripts/master/bert-master'\n",
    "os.listdir(BERT_SCRIPTS_DIR)\n",
    "sys.path.insert(0, BERT_SCRIPTS_DIR)\n",
    "# Copy the keras_bert package into the working directory ahead of the keras_bert imports below.\n",
    "!cp -r '../input/kerasbert/keras_bert' '/kaggle/working'\n",
    "# Folder the notebook later joins with bert_config.json, vocab.txt and bert_model.ckpt.\n",
    "BERT_PRETRAINED_DIR = '../input/pretrained-bert-including-scripts/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12'\n",
    "\n",
    "from keras_bert.keras_bert import Tokenizer\n",
    "from keras_bert.keras_bert import get_base_dict, get_model, gen_batch_inputs\n",
    "from keras_bert.keras_bert import load_trained_model_from_checkpoint\n",
    "\n",
    "from run_classifier import *\n",
    "import tokenization\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _trim_and_flatten(comments):\n",
    "    \"\"\"Drop leading/trailing whitespace from each comment, then turn every newline into a space.\"\"\"\n",
    "    trimmed = comments.replace({r'\\s+$': '', r'^\\s+': ''}, regex=True)\n",
    "    return trimmed.replace(r'\\n',  ' ', regex=True)\n",
    "\n",
    "\n",
    "# NOTE(review): `preprocess` is not defined anywhere in this notebook; presumably it is expected\n",
    "# to arrive through the `from run_classifier import *` star import -- confirm before a fresh run.\n",
    "\n",
    "# Training split: clean the text, build features and binary labels, then free the frame.\n",
    "train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')\n",
    "train['comment_text'] = _trim_and_flatten(train['comment_text'])\n",
    "x_train = preprocess(train['comment_text'])\n",
    "# Targets of 0.5 and above are labelled 1, everything else 0.\n",
    "y_train = np.where(train['target'] >= 0.5, 1, 0)\n",
    "del train\n",
    "gc.collect()\n",
    "\n",
    "# Test split: same text clean-up; the frame itself is kept because its `id` column is needed\n",
    "# when the submission file is written.\n",
    "test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')\n",
    "test['comment_text'] = _trim_and_flatten(test['comment_text'])\n",
    "x_test = preprocess(test['comment_text'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Colocations handled automatically by placer.\n",
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n"
     ]
    }
   ],
   "source": [
    "# Resolve the three pretrained-BERT artifacts inside BERT_PRETRAINED_DIR.\n",
    "config_file, dict_file, checkpoint_file = (\n",
    "    os.path.join(BERT_PRETRAINED_DIR, artifact)\n",
    "    for artifact in ('bert_config.json', 'vocab.txt', 'bert_model.ckpt')\n",
    ")\n",
    "\n",
    "# Load the checkpoint with a sequence length of 32; the tokenizer calls elsewhere in the\n",
    "# notebook use max_len=32 as well.\n",
    "bert_model = load_trained_model_from_checkpoint(config_file, checkpoint_file, seq_len=32)\n",
    "\n",
    "# Build the token -> id mapping: each stripped vocabulary line is assigned the current size\n",
    "# of the dictionary as its id.\n",
    "token_dict = {}\n",
    "with codecs.open(dict_file, 'r', 'utf8') as vocab_handle:\n",
    "    for token in (raw_line.strip() for raw_line in vocab_handle):\n",
    "        token_dict[token] = len(token_dict)\n",
    "\n",
    "tokenizer = Tokenizer(token_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "bidirectional_1 (Bidirection (None, 32, 256)           919552    \n",
      "_________________________________________________________________\n",
      "spatial_dropout1d_1 (Spatial (None, 32, 256)           0         \n",
      "_________________________________________________________________\n",
      "bidirectional_2 (Bidirection (None, 32, 256)           395264    \n",
      "_________________________________________________________________\n",
      "flatten_1 (Flatten)          (None, 8192)              0         \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 512)               4194816   \n",
      "_________________________________________________________________\n",
      "dropout_1 (Dropout)          (None, 512)               0         \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 256)               131328    \n",
      "_________________________________________________________________\n",
      "dropout_2 (Dropout)          (None, 256)               0         \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 2)                 514       \n",
      "=================================================================\n",
      "Total params: 5,641,474\n",
      "Trainable params: 5,641,474\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Classification head trained on top of the BERT outputs.\n",
    "# Each sample is a (32, 768) array, the per-sample shape of the batches produced by the generator.\n",
    "model = Sequential()\n",
    "model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True), input_shape=(32, 768)))\n",
    "model.add(SpatialDropout1D(0.3))\n",
    "model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True)))\n",
    "# Flatten all 32 time steps into one vector before the dense layers.\n",
    "model.add(Flatten())\n",
    "model.add(Dense(512, activation='relu'))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(Dense(256, activation='relu'))\n",
    "model.add(Dropout(0.2))\n",
    "# Two-way softmax to match the one-hot labels ([1, 0] / [0, 1]) emitted by the data generator.\n",
    "model.add(Dense(2, activation='softmax'))\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "\n",
    "# summary() prints the table itself and returns None, so wrapping it in print() only adds a\n",
    "# stray 'None' line to the output.\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DataGenerator(keras.utils.Sequence):\n",
    "    \"\"\"Batch generator that turns raw texts into BERT embeddings plus one-hot labels.\n",
    "\n",
    "    Every batch is tokenized with the notebook-level ``tokenizer`` and fed through the\n",
    "    notebook-level ``bert_model``; that model's output is returned as the batch features.\n",
    "    A label equal to 1 is encoded as ``[0, 1]``, any other label as ``[1, 0]``.\n",
    "\n",
    "    Args:\n",
    "        dataX: Indexable collection of input texts (list, array or pandas Series).\n",
    "        dataY: Indexable collection of labels with the same length as ``dataX``.\n",
    "        batch_size: Number of samples per batch; the last batch of an epoch may be smaller.\n",
    "        shuffle: When truthy, the sample order is shuffled on construction and after each epoch.\n",
    "        max_len: Sequence length handed to ``tokenizer.encode``. It has to agree with the\n",
    "            sequence length ``bert_model`` was loaded with (32 in this notebook).\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: If ``dataX`` and ``dataY`` differ in length.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, dataX, dataY, batch_size=1, shuffle=True, max_len=32):\n",
    "        # Features and labels must pair up one-to-one.\n",
    "        assert len(dataX) == len(dataY), 'dataX and dataY must have the same length'\n",
    "        self.dataX = dataX\n",
    "        self.dataY = dataY\n",
    "        self.batch_size = batch_size\n",
    "        self.shuffle = shuffle\n",
    "        self.max_len = max_len\n",
    "        # Positions into dataX/dataY; every batch is a contiguous slice of this array.\n",
    "        self.indexes = np.arange(len(self.dataX))\n",
    "        if self.shuffle:\n",
    "            np.random.shuffle(self.indexes)\n",
    "\n",
    "    def __len__(self):\n",
    "        \"\"\"Return the number of batches per epoch (the final batch may be partial).\"\"\"\n",
    "        return math.ceil(len(self.dataX) / float(self.batch_size))\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        \"\"\"Build batch number ``index`` and return it as ``(features, labels)``.\"\"\"\n",
    "        batch_positions = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]\n",
    "        batch_X = self._take(self.dataX, batch_positions)\n",
    "        batch_Y = self._take(self.dataY, batch_positions)\n",
    "        return self.data_generation(batch_X, batch_Y)\n",
    "\n",
    "    def on_epoch_end(self):\n",
    "        \"\"\"Reshuffle the sample order after each epoch when shuffling is enabled.\"\"\"\n",
    "        if self.shuffle:\n",
    "            np.random.shuffle(self.indexes)\n",
    "\n",
    "    @staticmethod\n",
    "    def _take(data, positions):\n",
    "        \"\"\"Select items by position; pandas objects go through ``.iloc`` so that a\n",
    "        non-default index cannot misalign rows or raise a KeyError.\"\"\"\n",
    "        if hasattr(data, 'iloc'):\n",
    "            return list(data.iloc[positions])\n",
    "        return [data[k] for k in positions]\n",
    "\n",
    "    def data_generation(self, batch_X, batch_Y):\n",
    "        \"\"\"Encode one batch of texts with BERT and one-hot encode its labels.\n",
    "\n",
    "        Args:\n",
    "            batch_X: Texts of the batch.\n",
    "            batch_Y: Labels of the batch, in the same order as ``batch_X``.\n",
    "\n",
    "        Returns:\n",
    "            Tuple ``(word_vec, y)``: the ``bert_model.predict`` output for the batch and an\n",
    "            integer array with one ``[1, 0]`` / ``[0, 1]`` row per label.\n",
    "        \"\"\"\n",
    "        # tokenizer.encode returns a (token ids, segment ids) pair for each text.\n",
    "        encoded = [tokenizer.encode(first=text, max_len=self.max_len) for text in batch_X]\n",
    "        indices = [token_ids for token_ids, _ in encoded]\n",
    "        segments = [segment_ids for _, segment_ids in encoded]\n",
    "        word_vec = bert_model.predict([np.array(indices), np.array(segments)])\n",
    "\n",
    "        # Column 0 flags 'label != 1', column 1 flags 'label == 1'.\n",
    "        is_positive = np.asarray(batch_Y) == 1\n",
    "        y = np.column_stack([~is_positive, is_positive]).astype(int)\n",
    "\n",
    "        return word_vec, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training batches: 64 comments per batch, with shuffling enabled.\n",
    "dataGen = DataGenerator(x_train, y_train, batch_size=64, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(64, 32, 768)\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: pull only the first batch and show the shape of its feature array.\n",
    "first_batch = next(iter(dataGen), None)\n",
    "if first_batch is not None:\n",
    "    x, y = first_batch\n",
    "    print(x.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use tf.cast instead.\n",
      "Epoch 1/1\n",
      " 5966/28202 [=====>........................] - ETA: 53:50 - loss: 0.1998 - acc: 0.9337"
     ]
    }
   ],
   "source": [
    "# Train the classifier head for one pass over the generator (one epoch, as the saved log shows).\n",
    "model.fit_generator(generator=dataGen, epochs=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 97320/97320 [01:13<00:00, 1331.03it/s]\n"
     ]
    }
   ],
   "source": [
    "import tqdm\n",
    "\n",
    "# Tokenize every test comment; tokenizer.encode yields a (token ids, segment ids) pair.\n",
    "encoded_test = [tokenizer.encode(first=text, max_len=32) for text in tqdm.tqdm(x_test)]\n",
    "indices = [token_ids for token_ids, _ in encoded_test]\n",
    "segments = [segment_ids for _, segment_ids in encoded_test]\n",
    "del encoded_test\n",
    "\n",
    "# Run the whole test set through BERT in a single predict call.\n",
    "test_data = [np.array(indices), np.array(segments)]\n",
    "test_vec = bert_model.predict(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(97320, 32, 768)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the shape of the BERT output computed for the test set.\n",
    "test_vec.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the large intermediates that are no longer needed before predicting, then force a\n",
    "# garbage-collection pass; the cell displays the value returned by gc.collect().\n",
    "del x_test, dataGen, test_data, indices, segments\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column 1 of the two-way softmax output is the score for label 1.\n",
    "predictions = model.predict_proba(test_vec, batch_size=32)[:, 1]\n",
    "\n",
    "# One row per test comment: its id and the predicted score.\n",
    "submission = pd.DataFrame({'id': test['id'], 'prediction': predictions})\n",
    "submission.to_csv('submission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
